# Load libraries
library(tidyverse); theme_set(theme_bw())
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(fs)
# Input files
acc_file <- "/labs/kingsley/ambenj/TSIV/resources/240814_genomes_for_core_gene_analysis.txt"
core_genes_file <- "/labs/kingsley/ambenj/TSIV/resources/Eaton2007_iridovirus_core_genes.txt"
blastp_folder <- "/labs/kingsley/ambenj/TSIV/analysis/assembly/04_blastp_iridoviruses"

# Output files
core_gene_outfile <- "/labs/kingsley/ambenj/TSIV/analysis/assembly/04_blastp_iridoviruses/core_genes_needs_curation.txt"
genbank_gene_outfile <- "/labs/kingsley/ambenj/TSIV/analysis/assembly/04_blastp_iridoviruses/core_genes_genbank.txt"
TSIV_gene_outfile <- "/labs/kingsley/ambenj/TSIV/analysis/assembly/04_blastp_iridoviruses/core_genes_TSIV.txt"
# Read in accession file
acc <- read_tsv(acc_file) %>% 
  mutate(virus_accession = str_remove(File_prefix, '\\.1'))
## Rows: 76 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (8): Virus, Abbreviation, Genus, GenBank_accession, Ref, Annotations, Fi...
## dbl (2): Genome_Size, ORF_num
## lgl (1): Comments
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
acc
# Read in core genes file
core_genes <- read_tsv(core_genes_file)
## Rows: 26 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (13): Gene Name, FV3, TFV, ATV, SGIV, GIV, LCDV-1, LCDV-C, ISKNV, RBIV, ...
## dbl  (1): Order
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
core_genes

Get Eaton sequences for manual curations

# Read blastp results files
files <- dir_ls(path = blastp_folder, glob = "*_blastp.tsv")

# function to add file name to dataframe
read_and_record_filename <- function(filename){
  read_tsv(filename, col_names = c("qseqid", "sseqid", "evalue", "stitle", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "bitscore")) %>%
  mutate(filename = path_file(filename))
  }

# gather files into one dataframe
blastp <- map_df(files, read_and_record_filename) %>% 
  separate(qseqid, into=c("TSIV_ORF"), sep="\\|", remove=FALSE, extra="drop") %>% 
  mutate(TSIV_ORF = as.integer(str_remove(TSIV_ORF, "gene_"))) %>% 
  separate(filename, into=c("virus_accession"), sep="\\.", extra="drop")
## Rows: 194 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 101 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 196 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 197 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 194 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 181 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 206 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 111 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 107 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 201 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 95 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 116 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 196 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 190 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 197 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 195 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 196 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 198 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 200 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 194 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 200 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 195 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 195 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 196 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 194 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 198 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 194 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 193 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 194 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 195 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 197 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 197 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 196 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 195 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 193 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 195 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 196 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 196 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 188 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 196 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 101 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 94 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 64 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 204 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 203 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 203 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 205 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 58 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 193 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 201 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 202 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 201 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 196 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 195 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 197 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 201 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 198 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 196 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 205 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 204 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 191 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 201 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 202 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 209 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 204 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 176 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 174 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 174 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 179 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 104 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 196 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 189 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 120 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 210 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 203 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): qseqid, sseqid, stitle
## dbl (10): evalue, pident, length, mismatch, gapopen, qstart, qend, sstart, s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
blastp %>% 
  arrange(TSIV_ORF)
# Get top hit for each accession for each TSIV ORF
top_blastp <- blastp %>%
  filter(evalue < 0.1) %>% 
  group_by(TSIV_ORF, virus_accession) %>% 
  arrange(evalue) %>% 
  slice(1) %>% 
  ungroup() %>% 
  arrange(TSIV_ORF)
top_blastp
# Get core gene table
core_genes_blastp <- top_blastp %>% 
  select(qseqid, TSIV_ORF, sseqid, virus_accession) %>% 
  pivot_wider(id_cols = c("qseqid", "TSIV_ORF"), names_from = virus_accession, values_from = sseqid) %>% 
  separate(FV3_AY548484, into=c("FV3"), sep="\\(", remove=FALSE, extra="drop") %>% 
  left_join(core_genes, ., by=c("FV3")) %>% 
  filter(!is.na(Order), !TSIV_ORF %in% c(82, 29)) %>% 
  arrange(Order)

core_genes_blastp
core_genes_blastp %>% 
  write_tsv(core_gene_outfile)
# Get TSIV core genes
TSIV_core_genes_blastp <- core_genes_blastp%>% 
  select(Order, `Gene Name`, TSIV_ORF, qseqid)
TSIV_core_genes_blastp

Get genbank sourced annotations and TSIV for core genes

# Get top results for Genbank annotation viruses
genbank_core_genes <- top_blastp %>% 
  left_join(., acc, by="virus_accession") %>% 
  filter(Annotations=="Genbank", Core_protein_alignment=="yes") %>% 
  left_join(TSIV_core_genes_blastp, .)
## Joining with `by = join_by(TSIV_ORF, qseqid)`
genbank_core_genes
# Check all expected sequences are present
genbank_core_genes %>% 
  select(Order, `Gene Name`, qseqid, TSIV_ORF, sseqid, virus_accession) %>% 
  pivot_wider(id_cols = c("Order", "Gene Name", "qseqid", "TSIV_ORF"), names_from = virus_accession, values_from = sseqid)
# Write output in long format to get protein sequences for genbank accessions
genbank_core_genes %>% 
  select(Order, `Gene Name`, virus_accession, sseqid) %>% 
  arrange(virus_accession) %>% 
  write_tsv(genbank_gene_outfile)
 # Write output in long format to get protein sequences for TSIV
TSIV_core_genes_blastp %>% 
  select(Order, `Gene Name`, qseqid) %>% 
  write_tsv(TSIV_gene_outfile)